References
!pip install gymnasium[atari]
!pip install gymnasium[accept-rom-license]
!pip install tensorflow
!pip install numpy
!pip install torch
!pip install -U scikit-learn
!pip install optuna
!pip install stable-baselines3[extra]
!pip install cmaes
import os
import gymnasium as gym
from collections import deque
from typing import Any
from typing import Dict
from tqdm import tqdm
import numpy as np
# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
import optuna
from optuna.trial import TrialState
# Load environment: Atari Ms. Pac-Man with RAM observations (the console's
# 128-byte memory) rather than pixels.
env = gym.make("ALE/MsPacman-ram-v5")
# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Fix the PyTorch RNG for reproducibility (trailing ';' suppresses notebook echo).
torch.manual_seed(50);
# Fixed (non-tuned) settings; merged with the Optuna-sampled hyperparameters
# inside the objective function.
default_hyperparams = {
    "epoch": 20,  # max number of episodes per optimization trial
    "max_t": 50000,  # max number of steps per trial
    "state_space": 128,  # RAM data for Atari console during game
    "action_space": 5,  # No-op, up, right, left, down
}
# Based off Optuna RL example code
# Changes by CS 175 project group: hyperparameters being sampled
def sample_hyperparams(trial: optuna.Trial) -> Dict[str, Any]:
    """Sample one hyperparameter configuration for this trial.

    The suggest_* calls are issued in a fixed order (the order matters to
    Optuna's samplers). Every value is also mirrored into the trial's user
    attributes, suffixed with "_", so the true (post-transform) values show
    up in the study records.
    """
    params: Dict[str, Any] = {}
    params["gamma"] = trial.suggest_float("gamma", 0.99995, 1, log=True)
    params["n_layers"] = trial.suggest_int("n_layers", 1, 5)
    params["h_size"] = trial.suggest_int("h_size", 4, 1024)
    params["dropout"] = trial.suggest_float("dropout", 0.0, 0.7, log=False)
    params["lr"] = trial.suggest_float("lr", 1e-6, 1e-2, log=True)
    params["longevity_exponential"] = trial.suggest_float(
        "longevity_exponential", 1.001, 1.01, log=True)
    params["step_penalty_multiplier"] = trial.suggest_float(
        "step_penalty_multiplier", 1, 1.1, log=True)
    # ghost_reward = trial.suggest_int("ghost_reward", -1000, 1000)
    params["ghost_reward"] = 0  # fixed: ghost-eating reward fully removed
    params["dot_extra_reward"] = trial.suggest_int("dot_extra_reward", 0, 20)
    params["energy_pill_extra_reward"] = trial.suggest_int(
        "energy_pill_extra_reward", 0, 100)
    # optimizer = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])
    params["optimizer"] = "SGD"  # fixed: optimizer choice not tuned
    # Display true values (insertion order matches the sampling order above).
    for key, value in params.items():
        trial.set_user_attr(key + "_", value)
    return params
# Based off Hugging Face policy gradient code
# Changes by CS 175 project group:
# - class inherits nn.Sequential rather than nn.Module
# - change to constructor method and deletion of explicitly defined forward method
# Based off Hugging Face policy gradient code
# Changes by CS 175 project group:
# - class inherits nn.Sequential rather than nn.Module
# - change to constructor method and deletion of explicitly defined forward method
class Policy(nn.Sequential):
    """MLP policy mapping a RAM state vector to log-probabilities over actions.

    Architecture: n_layers blocks of (Linear -> ReLU -> Dropout), then a final
    Linear projection followed by LogSoftmax(dim=1), so forward() returns
    *log*-probabilities of shape (batch, a_size).
    """

    def __init__(self, n_layers, h_size, dropout, s_size, a_size):
        """Build the network.

        Args:
            n_layers: number of hidden (Linear/ReLU/Dropout) blocks.
            h_size: width of each hidden layer.
            dropout: dropout probability applied after each hidden layer.
            s_size: input (state) dimension.
            a_size: output (action) dimension.
        """
        layers = []
        in_features = s_size
        for _ in range(n_layers):
            layers.append(nn.Linear(in_features, h_size))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout))
            in_features = h_size
        layers.append(nn.Linear(in_features, a_size))
        layers.append(nn.LogSoftmax(dim=1))
        super().__init__(*layers)

    def act(self, state):
        """Sample an action for a single numpy state.

        Returns:
            (action, log_prob): the sampled action as a Python int and its
            log-probability as a 1-element tensor (kept differentiable for
            the policy-gradient loss).
        """
        # Place the input on the same device as the model's own parameters
        # instead of relying on a module-level `device` global.
        model_device = next(self.parameters()).device
        state = torch.from_numpy(state).float().unsqueeze(0).to(model_device)
        log_probs = self.forward(state).cpu()
        # BUG FIX: forward() emits log-probabilities (LogSoftmax), so they must
        # be passed as `logits`. The original passed them positionally as
        # `probs`, which Categorical rejects (negative values) or misreads.
        m = Categorical(logits=log_probs)
        action = m.sample()
        return action.item(), m.log_prob(action)
# Contains policy trainer from Hugging Face policy gradient code
# Changes by CS 175 project group:
# - changes to reward for training
# - ensure changes to reward doesn't affect score output
# - added Optuna methods to evaluate episodes and prune trials if needed
# - cut out portions from original code not needed by trainer
def train(trial, policy, optimizer, epoch, max_t, gamma, ghost_reward, step_penalty_multiplier,
          longevity_exponential=0, dot_extra_reward=0, energy_pill_extra_reward=0):
    """Run REINFORCE episodes with shaped rewards and report scores to Optuna.

    Uses the module-level `env` (Ms. Pac-Man RAM environment). The reward
    shaping (longevity bonus, equalized ghost reward, dot/pill bonuses,
    escalating step penalty) affects only training; `score_adjustments`
    undoes every change so `final_score` equals the raw game score.

    Args:
        trial: optuna.Trial used for intermediate reporting and pruning.
        policy: Policy network exposing act(state) -> (action, log_prob).
        optimizer: torch optimizer over policy.parameters().
        epoch: episode budget (the loop runs epoch + 1 episodes).
        max_t: maximum environment steps per episode.
        gamma: discount factor for the returns.
        ghost_reward: training reward substituted for eating any ghost.
        step_penalty_multiplier: growth factor of the per-step penalty.
        longevity_exponential: base of the end-of-life longevity bonus.
        dot_extra_reward: shaping bonus added when a dot (10 pts) is eaten.
        energy_pill_extra_reward: bonus added when a pill (50 pts) is eaten.

    Returns:
        The raw (unshaped) game score of the final episode.

    Raises:
        optuna.exceptions.TrialPruned: when Optuna prunes the trial.
    """
    for i_epoch in range(epoch + 1):
        saved_log_probs = []
        rewards = []
        state, game_env = env.reset()
        # Variables for reward changes
        score_adjustments = 0  # running correction keeping final_score = raw score
        rewards_this_life = 0  # raw reward collected since the last life was lost
        step_penalty = 1
        cur_step_penalty = step_penalty
        for t in range(max_t):
            old_game_env = game_env
            action, log_prob = policy.act(state)
            saved_log_probs.append(log_prob)
            state, reward, terminated, truncated, game_env = env.step(action)
            # BUG FIX: also treat gymnasium time-limit truncation as episode end
            # (the original discarded the `truncated` flag).
            done = terminated or truncated
            # Longevity reward. More reward gathered for each life, larger reward
            if old_game_env["lives"] > game_env["lives"]:
                longevity_reward = longevity_exponential ** rewards_this_life
                rewards_this_life = 0
                reward += longevity_reward
                score_adjustments -= longevity_reward
                rewards.append(reward)
                # BUG FIX: the original `continue`d without checking `done`, so
                # losing the final life kept stepping a finished episode.
                if done:
                    break
                continue
            # BUG FIX: accumulate the raw reward toward the longevity bonus;
            # the original never incremented this, so the bonus was always
            # longevity_exponential ** 0 == 1.
            rewards_this_life += reward
            reward_change = 0
            # Equal penalty for eating ghost: replace the escalating 200/400/
            # 800/1600 point payouts with the flat ghost_reward.
            if reward // 100 == 2:
                reward_change = reward - 200 + ghost_reward
                score_adjustments += 200 - ghost_reward
            elif reward // 100 == 4:
                reward_change = reward - 400 + ghost_reward
                score_adjustments += 400 - ghost_reward
            elif reward // 100 == 8:
                reward_change = reward - 800 + ghost_reward
                score_adjustments += 800 - ghost_reward
            elif reward // 100 == 16:
                reward_change = reward - 1600 + ghost_reward
                score_adjustments += 1600 - ghost_reward
            # Penalty for going many steps without eating a dot (10 pts) or
            # energy pill (50 pts); eating either resets the penalty.
            if reward % 100 == 10:
                cur_step_penalty = step_penalty
                reward_change += dot_extra_reward
                score_adjustments -= dot_extra_reward
            elif reward % 100 == 50:
                cur_step_penalty = step_penalty
                reward_change += energy_pill_extra_reward
                score_adjustments -= energy_pill_extra_reward
            else:
                cur_step_penalty *= step_penalty_multiplier
                # BUG FIX: apply the escalating penalty; the original subtracted
                # the constant `step_penalty`, leaving `cur_step_penalty` (and
                # the step_penalty_multiplier hyperparameter) without effect.
                reward_change -= cur_step_penalty
                score_adjustments += cur_step_penalty
            rewards.append(reward + reward_change)
            if done:
                break
        final_score = sum(rewards) + score_adjustments
        returns = deque(maxlen=max_t)
        # Compute the discounted returns at each timestep, iterating from the
        # last step to the first so each step reuses the already-computed tail;
        # appendleft keeps the deque in chronological order.
        n_steps = len(rewards)
        disc_return_t = 0
        for t in reversed(range(n_steps)):
            disc_return_t = rewards[t] + gamma * disc_return_t
            returns.appendleft(disc_return_t)
        # Standardize returns for training stability; eps guards against a
        # near-zero standard deviation.
        eps = np.finfo(np.float32).eps.item()
        returns = torch.tensor(returns)
        returns = (returns - returns.mean()) / (returns.std() + eps)
        policy_loss = []
        for log_prob, disc_return in zip(saved_log_probs, returns):
            policy_loss.append(-log_prob * disc_return)
        policy_loss = torch.cat(policy_loss).sum()
        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()
        # Report the raw score so Optuna can prune unpromising trials early.
        trial.report(final_score, i_epoch)
        # Handle pruning based on the intermediate value.
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
    return final_score
# Based off Optuna simple example code
# Changes by CS 175 project group:
# - replaced original policy with policy for Ms Pacman
# - consolidated training code into separate function (previous code box)
def objective(trial):
    """Optuna objective: build, train, and score one policy configuration."""
    # Sampled values override the fixed defaults where keys overlap.
    hp = dict(default_hyperparams)
    hp.update(sample_hyperparams(trial))
    # Generate the model.
    policy = Policy(
        hp["n_layers"],
        hp["h_size"],
        hp["dropout"],
        hp["state_space"],
        hp["action_space"],
    ).to(device)
    # Generate the optimizer by name from torch.optim.
    optimizer_cls = getattr(optim, hp["optimizer"])
    optimizer = optimizer_cls(policy.parameters(), lr=hp["lr"])
    # Train and return the final raw game score for Optuna to maximize.
    return train(
        trial, policy, optimizer,
        epoch=hp["epoch"],
        max_t=hp["max_t"],
        gamma=hp["gamma"],
        ghost_reward=hp["ghost_reward"],
        step_penalty_multiplier=hp["step_penalty_multiplier"],
        longevity_exponential=hp["longevity_exponential"],
        dot_extra_reward=hp["dot_extra_reward"],
        energy_pill_extra_reward=hp["energy_pill_extra_reward"],
    )
# Create an Optuna study
# Study info will be saved at path given to "storage" parameter as .db file
# NOTE(review): no "storage" argument is passed here, so this study is
# in-memory only (matching the "created in memory" log line below); the
# load_study call further down reads a separate sqlite file — confirm which
# study the run is meant to use.
study = optuna.create_study(study_name="MsPacMan_study",
                            direction="maximize",
                            # Default sampler and pruner recommended for <1000 trials.
                            # Uncomment the following two lines to use CMA-ES
                            # sampling with Hyperband pruning instead:
                            # sampler=optuna.samplers.CmaEsSampler(consider_pruned_trials=False),
                            # pruner=optuna.pruners.HyperbandPruner()
                            )
[I 2023-12-11 11:12:15,663] A new study created in memory with name: MsPacMan_study
# Load saved study from its sqlite storage file.
study = optuna.load_study(study_name="MsPacMan_study", storage="sqlite:///Studies/MsPacMan_study.db")
# Start Optuna study: 50 trials, no wall-clock limit, single process.
# gc_after_trial=True frees each trial's model/optimizer before the next one.
# show_progress_bar=True wouldn't work on Jupyter Notebook without installing Google Colab package
# n_jobs: number of parallel jobs
study.optimize(objective, n_trials=50, timeout=None, n_jobs=1, gc_after_trial=True, show_progress_bar=False)
[I 2023-12-11 11:12:43,250] Trial 0 finished with value: 230.0 and parameters: {'gamma': 0.9999692031511148, 'n_layers': 4, 'h_size': 256, 'dropout': 0.1762122397011413, 'lr': 0.0002502409509623826, 'longevity_exponential': 1.0047816572131334, 'step_penalty_multiplier': 1.0779003102967768, 'dot_extra_reward': 20, 'energy_pill_extra_reward': 57}. Best is trial 0 with value: 230.0.
[I 2023-12-11 11:13:02,236] Trial 1 finished with value: 200.0 and parameters: {'gamma': 0.9999753440414736, 'n_layers': 3, 'h_size': 412, 'dropout': 0.6145633028030688, 'lr': 0.00020370932422462964, 'longevity_exponential': 1.0057163897294066, 'step_penalty_multiplier': 1.0529321019556988, 'dot_extra_reward': 20, 'energy_pill_extra_reward': 76}. Best is trial 0 with value: 230.0.
[I 2023-12-11 11:13:24,555] Trial 2 finished with value: 200.0 and parameters: {'gamma': 0.9999722852096964, 'n_layers': 4, 'h_size': 372, 'dropout': 0.40882190345708047, 'lr': 3.385352126959437e-05, 'longevity_exponential': 1.0077938949853686, 'step_penalty_multiplier': 1.0899462254339654, 'dot_extra_reward': 7, 'energy_pill_extra_reward': 20}. Best is trial 0 with value: 230.0.
[I 2023-12-11 11:14:18,432] Trial 3 finished with value: 220.0 and parameters: {'gamma': 0.9999859381666313, 'n_layers': 3, 'h_size': 967, 'dropout': 0.05000946918026199, 'lr': 3.644208074592394e-05, 'longevity_exponential': 1.009943284717741, 'step_penalty_multiplier': 1.0037587742205414, 'dot_extra_reward': 10, 'energy_pill_extra_reward': 66}. Best is trial 0 with value: 230.0.
[I 2023-12-11 11:14:29,042] Trial 4 finished with value: 200.0 and parameters: {'gamma': 0.9999771592838165, 'n_layers': 2, 'h_size': 16, 'dropout': 0.6027795147012084, 'lr': 1.7620923530872803e-05, 'longevity_exponential': 1.0036798711668844, 'step_penalty_multiplier': 1.0278501161835476, 'dot_extra_reward': 19, 'energy_pill_extra_reward': 60}. Best is trial 0 with value: 230.0.
[I 2023-12-11 11:14:50,764] Trial 5 finished with value: 170.0 and parameters: {'gamma': 0.9999892535357757, 'n_layers': 4, 'h_size': 415, 'dropout': 0.28736803249840215, 'lr': 0.0036511920726623607, 'longevity_exponential': 1.006319323210271, 'step_penalty_multiplier': 1.0017418504252942, 'dot_extra_reward': 5, 'energy_pill_extra_reward': 6}. Best is trial 0 with value: 230.0.
[I 2023-12-11 11:15:40,088] Trial 6 finished with value: 290.0 and parameters: {'gamma': 0.9999861632147989, 'n_layers': 3, 'h_size': 830, 'dropout': 0.29617055176534335, 'lr': 0.0004917749348627604, 'longevity_exponential': 1.0033305391643417, 'step_penalty_multiplier': 1.0285037869184064, 'dot_extra_reward': 15, 'energy_pill_extra_reward': 64}. Best is trial 6 with value: 290.0.
[I 2023-12-11 11:15:40,942] Trial 7 pruned.
[I 2023-12-11 11:15:41,921] Trial 8 pruned.
[I 2023-12-11 11:15:42,799] Trial 9 pruned.
[I 2023-12-11 11:15:46,224] Trial 10 pruned.
[I 2023-12-11 11:15:49,263] Trial 11 pruned.
[I 2023-12-11 11:15:50,349] Trial 12 pruned.
[I 2023-12-11 11:15:51,475] Trial 13 pruned.
[I 2023-12-11 11:15:52,729] Trial 14 pruned.
[I 2023-12-11 11:16:12,176] Trial 15 finished with value: 250.0 and parameters: {'gamma': 0.9999808234768816, 'n_layers': 4, 'h_size': 154, 'dropout': 0.12090439946790885, 'lr': 0.00017369346396499275, 'longevity_exponential': 1.0035306163227822, 'step_penalty_multiplier': 1.0424610615800955, 'dot_extra_reward': 18, 'energy_pill_extra_reward': 58}. Best is trial 6 with value: 290.0.
[I 2023-12-11 11:16:13,601] Trial 16 pruned.
[I 2023-12-11 11:16:14,404] Trial 17 pruned.
[I 2023-12-11 11:16:16,870] Trial 18 pruned.
[I 2023-12-11 11:16:34,830] Trial 19 finished with value: 310.0 and parameters: {'gamma': 0.9999926127055915, 'n_layers': 3, 'h_size': 178, 'dropout': 0.0069241572543159435, 'lr': 0.0001014520595303218, 'longevity_exponential': 1.002464702839554, 'step_penalty_multiplier': 1.0457572624287927, 'dot_extra_reward': 11, 'energy_pill_extra_reward': 93}. Best is trial 19 with value: 310.0.
[I 2023-12-11 11:16:36,392] Trial 20 pruned.
[I 2023-12-11 11:16:57,782] Trial 21 finished with value: 650.0 and parameters: {'gamma': 0.9999931906352822, 'n_layers': 3, 'h_size': 169, 'dropout': 0.006649481779098021, 'lr': 0.00010041720505743183, 'longevity_exponential': 1.0030306466772165, 'step_penalty_multiplier': 1.0403995699975574, 'dot_extra_reward': 9, 'energy_pill_extra_reward': 73}. Best is trial 21 with value: 650.0.
[I 2023-12-11 11:16:58,512] Trial 22 pruned.
[I 2023-12-11 11:16:59,421] Trial 23 pruned.
[I 2023-12-11 11:17:00,375] Trial 24 pruned.
[I 2023-12-11 11:17:18,804] Trial 25 finished with value: 210.0 and parameters: {'gamma': 0.9999886717895855, 'n_layers': 3, 'h_size': 237, 'dropout': 0.21330213401198192, 'lr': 1.1509465567672235e-05, 'longevity_exponential': 1.001035991088817, 'step_penalty_multiplier': 1.0416126981983824, 'dot_extra_reward': 11, 'energy_pill_extra_reward': 75}. Best is trial 21 with value: 650.0.
[I 2023-12-11 11:17:38,563] Trial 26 finished with value: 340.0 and parameters: {'gamma': 0.9999962717663855, 'n_layers': 2, 'h_size': 329, 'dropout': 0.06399755120130299, 'lr': 7.442700511327361e-05, 'longevity_exponential': 1.0040731706591077, 'step_penalty_multiplier': 1.0304364800076233, 'dot_extra_reward': 4, 'energy_pill_extra_reward': 86}. Best is trial 21 with value: 650.0.
[I 2023-12-11 11:17:39,403] Trial 27 pruned.
[I 2023-12-11 11:17:40,249] Trial 28 pruned.
[I 2023-12-11 11:17:57,110] Trial 29 finished with value: 300.0 and parameters: {'gamma': 0.9999957696202204, 'n_layers': 2, 'h_size': 211, 'dropout': 0.1351672126242052, 'lr': 0.00029714725045713083, 'longevity_exponential': 1.0050929060501523, 'step_penalty_multiplier': 1.0660298363608978, 'dot_extra_reward': 0, 'energy_pill_extra_reward': 83}. Best is trial 21 with value: 650.0.
[I 2023-12-11 11:17:58,133] Trial 30 pruned.
[I 2023-12-11 11:17:59,355] Trial 31 pruned.
[I 2023-12-11 11:18:00,135] Trial 32 pruned.
[I 2023-12-11 11:18:22,801] Trial 33 finished with value: 380.0 and parameters: {'gamma': 0.9999921682544429, 'n_layers': 3, 'h_size': 194, 'dropout': 0.1358679172202715, 'lr': 5.434784683786154e-05, 'longevity_exponential': 1.0028078106929046, 'step_penalty_multiplier': 1.0530090630882818, 'dot_extra_reward': 6, 'energy_pill_extra_reward': 94}. Best is trial 21 with value: 650.0.
[I 2023-12-11 11:18:43,621] Trial 34 finished with value: 330.0 and parameters: {'gamma': 0.9999911743807219, 'n_layers': 3, 'h_size': 345, 'dropout': 0.06748343572692499, 'lr': 5.424840934115718e-05, 'longevity_exponential': 1.00280734024581, 'step_penalty_multiplier': 1.051590334980659, 'dot_extra_reward': 6, 'energy_pill_extra_reward': 91}. Best is trial 21 with value: 650.0.
[I 2023-12-11 11:18:44,687] Trial 35 pruned.
[I 2023-12-11 11:18:45,810] Trial 36 pruned.
[I 2023-12-11 11:18:46,971] Trial 37 pruned.
[I 2023-12-11 11:18:48,075] Trial 38 pruned.
[I 2023-12-11 11:18:48,772] Trial 39 pruned.
[I 2023-12-11 11:18:49,644] Trial 40 pruned.
[I 2023-12-11 11:18:50,877] Trial 41 pruned.
[I 2023-12-11 11:18:52,173] Trial 42 pruned.
[I 2023-12-11 11:19:14,969] Trial 43 finished with value: 320.0 and parameters: {'gamma': 0.9999875621704095, 'n_layers': 3, 'h_size': 326, 'dropout': 0.087581568152122, 'lr': 0.00010853859356981085, 'longevity_exponential': 1.0023722116305447, 'step_penalty_multiplier': 1.0312161519450713, 'dot_extra_reward': 8, 'energy_pill_extra_reward': 91}. Best is trial 21 with value: 650.0.
[I 2023-12-11 11:19:16,025] Trial 44 pruned.
[I 2023-12-11 11:19:17,050] Trial 45 pruned.
[I 2023-12-11 11:19:17,909] Trial 46 pruned.
[I 2023-12-11 11:19:19,003] Trial 47 pruned.
[I 2023-12-11 11:19:20,037] Trial 48 pruned.
[I 2023-12-11 11:19:20,901] Trial 49 pruned.
# Recommended hyperparameters from Optuna study
# Exact code from Optuna simple example
# Partition the finished trials by their terminal state for the summary.
pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])
print("Study statistics: ")
print(" Number of finished trials: ", len(study.trials))
print(" Number of pruned trials: ", len(pruned_trials))
print(" Number of complete trials: ", len(complete_trials))
# Best trial = highest final score (study direction is "maximize").
print("Best trial:")
trial = study.best_trial
print(" Value: ", trial.value)
print(" Params: ")
for key, value in trial.params.items():
    print(" {}: {}".format(key, value))
Study statistics:
Number of finished trials: 50
Number of pruned trials: 34
Number of complete trials: 16
Best trial:
Value: 650.0
Params:
gamma: 0.9999931906352822
n_layers: 3
h_size: 169
dropout: 0.006649481779098021
lr: 0.00010041720505743183
longevity_exponential: 1.0030306466772165
step_penalty_multiplier: 1.0403995699975574
dot_extra_reward: 9
energy_pill_extra_reward: 73
# Importance evaluation for each hyperparameter from Optuna study
# (renders an interactive figure; see text-only alternative below).
optuna.visualization.plot_param_importances(study)
# Text-only alternative for environments without plotting support:
# print("Importances:")
# for key, value in optuna.importance.get_param_importances(study).items():
#     print(key, ":", value)